!sudo dpkg -i ~/tensorflow/personal-scratch/kake/cudnn/libcudnn7_7.3.1.20-1+cuda9.0_amd64.deb
!pip install -I --user tensorflow-gpu pretty_midi pypianoroll pandas
import IPython
import pretty_midi
import numpy as np
from time import time
import matplotlib.pyplot as plt
import pypianoroll as pproll
import tensorflow as tf
import pandas as pd
import numpy as np
tf.enable_eager_execution()
%matplotlib inline
The task is framed as autoregressive multilabel classification (that is, at each time step perform a multilabel classification conditioned on the classification choices already made).
While this is quite straightforward, there is a discrepancy between training (next-step prediction) and the true objective (long-term generation).
I ended up doing it anyway; at the end there is more information on other techniques attempted, and on some not attempted.
# --- Hyperparameters and paths ---
BATCH_SIZE = 32  # sequences per training batch
NUM_LAYERS = 2  # stacked LSTM layers
RNN_SIZE = 256  # hidden units per LSTM layer
FEATURE_SIZE = 128  # one feature per piano-roll row (128 MIDI pitches)
EPSILON = 1e-5  # NOTE(review): defined but unused in the visible code
L2_WEIGHT = .001  # weight for the (currently commented-out) L2 regularizer
EPOCHS = 20000  # dataset repeat count for the training loop
PIANOROLL_PATH = 'songs/*.csv'  # glob of piano-roll CSV files
COMPOSERS = ['mz', 'br', 'de', 'ba']  # composer filename prefixes -- presumably Mozart/Brahms/Debussy/Bach; confirm
SAVE_DIR = 'composer/'  # checkpoint directory
As the dataset is very small, we don't bother with truncated backpropagation and instead cache the pre-batched matrices in memory.
# Input pipeline (TF 1.x eager + tf.data): read each CSV whole, parse it into
# a float matrix, and zip it with a composer id derived from the filename
# prefix. NOTE(review): this is a notebook export whose indentation was
# stripped; the statements below logically belong inside the `with` block
# and the two defs.
with tf.device('cpu:0'):
# Static string->int table: composer prefix -> index into COMPOSERS
# (unknown prefixes map to the table's default value of -1).
mapping = tf.contrib.lookup.index_table_from_tensor(tf.constant(COMPOSERS))
def parse_numeric_csv(dataset):
''' numeric CSV parser '''
# file contents -> lines -> comma-separated fields -> numbers;
# missing fields are filled with '0' before numeric conversion.
return (dataset
.map(lambda x: tf.sparse.to_dense(tf.string_split([x], '\n'), '')[0])
.map(lambda x: tf.sparse.to_dense(tf.string_split(x, ','), '0'))
.map(tf.strings.to_number))
def get_composer_ids(x):
''' filename to composer ids '''
# last '/'-separated path component = filename; its first two
# characters are the composer prefix (e.g. 'ba').
file_name = tf.sparse.to_dense(tf.strings.split([x], '/'), '')[0, -1]
prefix = tf.strings.substr(file_name, 0, 2)
ids = mapping.lookup(prefix)
return ids
paths = tf.data.Dataset.list_files(PIANOROLL_PATH)
files = (paths
.map(tf.read_file)
.apply(parse_numeric_csv)
# Drop the first row and first column (header / index), then append a
# column of 128s -- presumably an end-of-song marker (it becomes an
# all-ones frame after the binarization in the training loop); confirm.
.map(lambda x: tf.concat([x[1:, 1:], 128. * tf.ones((128, 1))], axis=1))
# Pair each (keys x time) matrix with its time length for loss masking.
.map(lambda x: (x, tf.shape(x)[-1])))
composers = paths.map(get_composer_ids)
ds = tf.data.Dataset.zip((composers, files))
# Cache the (small) dataset in memory, then repeat and pad-batch: the time
# axis (-1) is padded to the longest song in each batch.
ds = (ds.cache()
.repeat(EPOCHS)
.padded_batch(BATCH_SIZE, padded_shapes=((), ([FEATURE_SIZE, -1], ()))))
# Overlap host-to-device transfer with training.
ds = ds.apply(tf.data.experimental.prefetch_to_device('gpu:0'))
class BinarizedNeuralComposer(tf.keras.Model):
# Autoregressive note model: a stacked CudnnLSTM over time-major piano-roll
# frames, whose initial (h, c) state is looked up from learned per-composer
# embeddings, followed by a Dense projection to one logit per key.
def __init__(self, rnn_size, feature_size, composers):
super(BinarizedNeuralComposer, self).__init__()
self.rnn_size = rnn_size  # hidden units per LSTM layer
self.feature_size = feature_size  # logits emitted per time step (128 keys)
self.composers = composers  # composer prefixes; defines the embedding rows
self.rnn = tf.contrib.cudnn_rnn.CudnnLSTM(NUM_LAYERS, self.rnn_size, dropout=0.2)
# Learned initial cell/hidden states: one row per composer plus one
# extra row. NOTE(review): the extra row appears intended for the
# "generalist" id -1, but tf.nn.embedding_lookup does not wrap
# negative ids (on GPU an out-of-range id silently gathers zeros) --
# confirm that id -1 selects the intended row.
self.comp_emb_c = tf.get_variable(
'comp_emb_c',(len(self.composers) + 1, NUM_LAYERS, self.rnn_size))
self.comp_emb_h = tf.get_variable(
'comp_emb_h',(len(self.composers) + 1, NUM_LAYERS, self.rnn_size))
self.projection = tf.layers.Dense(self.feature_size)
def call(self, data, comp_id=-1, state=None, training=True):
# data: time-major frames (time x batch x keys -- see the transpose in
# the training loop). When no state is passed, build the initial state
# from the composer embeddings.
if state is None:
h = tf.nn.embedding_lookup(params=self.comp_emb_h, ids=comp_id)
c = tf.nn.embedding_lookup(params=self.comp_emb_c, ids=comp_id) # BATCH x LAYERS x RNN_SIZE
# CudnnLSTM expects layer-major state: LAYERS x BATCH x RNN_SIZE.
h = tf.transpose(h, [1, 0, 2])
c = tf.transpose(c, [1, 0, 2])
state = (h, c)
out, state = self.rnn(data, state, training=training)
logits = self.projection(out)
# Per-step key logits plus the final RNN state (reused when sampling).
return logits, state
model = BinarizedNeuralComposer(RNN_SIZE, FEATURE_SIZE, COMPOSERS)
optimizer = tf.train.AdamOptimizer(.00005)
root = tf.train.Checkpoint(optimizer=optimizer,
model=model,
optimizer_step=tf.train.get_or_create_global_step())
checkpoint = tf.train.latest_checkpoint(SAVE_DIR)
status = root.restore(checkpoint)
print(checkpoint)
# reg = tf.contrib.layers.l2_regularizer(L2_WEIGHT)
# Training loop: next-step prediction over binarized, time-major piano rolls.
for comp_id, (data, length) in ds:
begin = time()
data = tf.transpose(data, [2, 0, 1]) # batch x keys x time -> time x batch x keys
data = tf.to_float(tf.not_equal(data, tf.zeros_like(data))) # binarize
# Teacher forcing: inputs are frames [0, T-1), targets are frames [1, T).
x = data[:-1]
y = data[1:]
seq_length = length - 1
# (time, batch, 1) mask that zeroes the loss on padded time steps.
length_mask = tf.expand_dims(tf.transpose(tf.to_float(tf.sequence_mask(seq_length))), -1)
# Cool scheduled sampling technique which didn't fix much
'''
y_hat, _ = model(x, comp_id)
x_hat = tf.concat([x[:1], tf.nn.sigmoid(y_hat[:-1])], axis=0)
x_hat = tf.where(x_hat > .1, tf.ones_like(x_hat), tf.zeros_like(x_hat))
x_hat = tf.where(tf.random.uniform(tf.shape(x_hat)) > .5, x, x_hat)
'''
x_hat = x
# Train generalist / specialist 50 / 50
# (randomly replace about half of the composer ids with -1 so the same
# model also learns an unconditioned "generalist" mode).
comp_id = tf.where(.5 > tf.random.uniform(tf.shape(comp_id)), comp_id, -1 * tf.ones_like(comp_id))
with tf.GradientTape() as tape:
y_hat, _ = model(x_hat, comp_id)
# Independent per-key sigmoid cross-entropy, masked by sequence length.
loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=y, logits=y_hat, weights=length_mask)
# loss += tf.contrib.layers.apply_regularization(reg, [model.projection.weights[0], model.comp_emb])
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables),
global_step=tf.train.get_or_create_global_step())
# Log timing/loss and write a checkpoint every 500 optimizer steps.
if tf.train.get_or_create_global_step().numpy() % 500 == 0:
print(tf.train.get_or_create_global_step().numpy(), time() - begin, tf.reduce_sum(loss).numpy())
root.save(SAVE_DIR + 'finalfinal_2_256')
# Helper functions copied from github.com/zehsilva/neural-composer-assignement
def piano_roll_to_pretty_midi(piano_roll, fs=100, program=1):
    '''Convert a Piano Roll array into a PrettyMidi object
    with a single instrument.
    Parameters
    ----------
    piano_roll : np.ndarray, shape=(128,frames), dtype=int
        Piano roll of one instrument
    fs : int
        Sampling frequency of the columns, i.e. each column is spaced apart
        by ``1./fs`` seconds.
    program : int
        The program number of the instrument.
    Returns
    -------
    midi_object : pretty_midi.PrettyMIDI
        A pretty_midi.PrettyMIDI class instance describing
        the piano roll.
    '''
    n_pitches, _ = piano_roll.shape
    midi = pretty_midi.PrettyMIDI()
    inst = pretty_midi.Instrument(program=program)
    # Zero-pad one frame on each side so notes sounding at the very first or
    # very last frame still produce note-on / note-off events.
    padded = np.pad(piano_roll, [(0, 0), (1, 1)], 'constant')
    # A note turns on or off wherever the velocity changes over time.
    change_frames, change_pitches = np.nonzero(np.diff(padded).T)
    # Per-pitch bookkeeping: velocity of the currently-sounding note (0 means
    # "no note open") and the time at which it started.
    active_velocity = np.zeros(n_pitches, dtype=int)
    started_at = np.zeros(n_pitches)
    for frame, pitch in zip(change_frames, change_pitches):
        # +1 compensates for the left padding column above.
        velocity = padded[pitch, frame + 1]
        seconds = frame / fs
        if velocity > 0:
            # Open a note only if this pitch is not already sounding.
            if active_velocity[pitch] == 0:
                started_at[pitch] = seconds
                active_velocity[pitch] = velocity
        else:
            # Velocity dropped to zero: close the open note on this pitch.
            inst.notes.append(pretty_midi.Note(
                velocity=active_velocity[pitch],
                pitch=pitch,
                start=started_at[pitch],
                end=seconds))
            active_velocity[pitch] = 0
    midi.instruments.append(inst)
    return midi
def visualize_piano_roll(pianoroll_matrix, fs=5):
    """ input: piano roll matrix with shape (number of notes, time steps)
    effect: generates a nice graph with the piano roll visualization
    """
    roll = pianoroll_matrix
    # pypianoroll expects (time steps, notes); flip note-major input.
    if roll.shape[0] == 128:
        roll = roll.T.astype(float)
    track = pproll.Track(pianoroll=roll, program=0, is_drum=False, name='piano roll')
    # Plot the piano-roll
    fig, ax = track.plot(beat_resolution=fs)
    plt.show()
The model trained very slowly, so I tried a few tricks to simplify the problem. I also thought a little about making the training objective more similar to the generative use case.
Things tried
Things that would be fun to try
# Utils for reading piano roll file and sample songs from model
def show_and_tell(path, composer_ids=-1, length=30, fs=5):
    """Render a song and its model continuations.

    Shows the original piano roll with audio, then a continuation from the
    generalist model (composer id -1), then one continuation per id in
    `composer_ids`. Each continuation is seeded with the song's first
    5 seconds.
    """
    frames = length * fs  # number of piano-roll frames to display
    print('Original song')
    song = read(path)
    render_pianoroll(song[:frames, 0, :])
    render_audio(song, length)
    print('Generalist')
    generalist = compose(song, [-1], start_seconds=5, length_seconds=length - 5, fs=fs).numpy()
    render_pianoroll(generalist[:frames, 0, :])
    render_audio(generalist, length)
    for cid in composer_ids:
        print('Specialist:', COMPOSERS[cid])
        specialist = compose(song, [cid], start_seconds=5, length_seconds=length - 5, fs=fs).numpy()
        render_pianoroll(specialist[:frames, 0, :])
        render_audio(specialist, length)
def read(path):
    """Load a piano-roll CSV and return it as a binarized numpy array.

    The result's leading axis is time with a singleton middle axis
    (callers slice it as roll[:frames, 0, :]).
    """
    frame = pd.read_csv(path)  # Read CSV
    roll = frame.values[:, 1:]  # Remove indices
    roll = tf.transpose(roll)
    roll = tf.expand_dims(roll, 1)
    # Clamp velocities so every sounding note becomes 1 (binarize).
    roll = tf.minimum(1., roll)
    return roll.numpy()
def render_pianoroll(song):
    """Show `song` twice: as a raw matplotlib heat map, then via pypianoroll."""
    plt.figure(figsize=(15, 5))
    plt.imshow(song.T, cmap='hot')
    plt.show()
    visualize_piano_roll(song)
def render_audio(song, length):
    """Synthesize the first `length` seconds of `song` and embed an audio player."""
    sample_rate = 44100
    midi = piano_roll_to_pretty_midi(song[:, 0, :].T, fs=5)
    waveform = midi.synthesize()
    IPython.display.display(IPython.display.Audio(waveform[:sample_rate * length], rate=sample_rate))
def compose(song, composer_id, start_seconds=5, length_seconds=20, fs=5):
    """Continue `song` autoregressively for `length_seconds` of new frames.

    Seeds the model with the first `start_seconds` of the song, then feeds
    each binarized prediction back in one frame at a time.
    """
    seed = song[:fs * start_seconds]
    # Warm up the RNN on the seed; keep only the last step's prediction.
    logits, state = model(seed, composer_id, training=False)
    frame = round_to_zero(logits[-1:])
    generated = [seed, frame]
    for _ in range(fs * length_seconds):
        logits, state = model(frame, composer_id, state, training=False)
        frame = round_to_zero(logits)
        generated.append(frame)
    return tf.concat(generated, axis=0)
def round_to_zero(x, threshold=.1):
    """Binarize logits: 1.0 where sigmoid(x) exceeds `threshold`, else 0.0."""
    probs = tf.nn.sigmoid(x)
    ones = tf.ones_like(probs)
    zeros = tf.zeros_like(probs)
    return tf.where(probs > threshold, ones, zeros)
# Demo: continue a training-set song (Bach) and an unseen song (Debussy),
# each with the generalist plus Bach- and Debussy-conditioned specialists.
show_and_tell('songs/bach_847.csv', composer_ids=[COMPOSERS.index('ba'), COMPOSERS.index('de')])
show_and_tell('unseen_songs/debussy_prel.csv', composer_ids=[COMPOSERS.index('ba'), COMPOSERS.index('de')])